import warnings
# NOTE(review): this silences *every* warning (including pandas deprecation
# notices); consider filtering specific categories once the notebook is stable.
warnings.filterwarnings('ignore')
import pandas as pd
# Retail-store transaction lines: one row per (transaction, SKU) pair.
df = pd.read_csv("sku_transactions_1.csv")
df
# https://www.kaggle.com/datasets/marian447/retail-store-sales-transactions
| Date | Customer_ID | Transaction_ID | SKU | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|
| 0 | 02/01/2016 | 5541 | 4892 | 29DS1 | 2.0 | 18.67 |
| 1 | 02/01/2016 | 3403 | 4956 | VW5YU | 2.0 | 18.27 |
| 2 | 02/01/2016 | 9584 | 4955 | P8FMZ | 1.0 | 52.45 |
| 3 | 02/01/2016 | 9584 | 4955 | LEYMS | 2.0 | 21.98 |
| 4 | 02/01/2016 | 8411 | 4954 | RH5J5 | 1.0 | 4.77 |
| ... | ... | ... | ... | ... | ... | ... |
| 63985 | 30/06/2016 | 16993 | 32275 | 1H4DM | 1.0 | 37.63 |
| 63986 | 30/06/2016 | 16993 | 32275 | QGK3S | 1.0 | 5.69 |
| 63987 | 30/06/2016 | 21463 | 32277 | BMGRM | 1.0 | 6.96 |
| 63988 | 30/06/2016 | 16993 | 32275 | LPLTZ | 4.0 | 15.60 |
| 63989 | 30/06/2016 | 16993 | 32275 | XFAEF | 1.0 | 6.90 |
63990 rows × 6 columns
# Column dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 63990 entries, 0 to 63989 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 63990 non-null object 1 Customer_ID 63990 non-null int64 2 Transaction_ID 63990 non-null int64 3 SKU 63990 non-null object 4 Quantity 63990 non-null float64 5 Sales_Amount 63990 non-null float64 dtypes: float64(2), int64(2), object(2) memory usage: 2.9+ MB
# The numeric identifiers are labels, not quantities — store them as strings
# so they are excluded from numeric summaries and aggregations.
for id_col in ("Customer_ID", "Transaction_ID"):
    df[id_col] = df[id_col].astype(str)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 63990 entries, 0 to 63989 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 63990 non-null object 1 Customer_ID 63990 non-null object 2 Transaction_ID 63990 non-null object 3 SKU 63990 non-null object 4 Quantity 63990 non-null float64 5 Sales_Amount 63990 non-null float64 dtypes: float64(2), object(4) memory usage: 2.9+ MB
# Describe: summary statistics (count/mean/std/quantiles) for the numeric columns.
df.describe()
| Quantity | Sales_Amount | |
|---|---|---|
| count | 63990.000000 | 63990.000000 |
| mean | 1.463645 | 11.907501 |
| std | 3.368498 | 19.128415 |
| min | 0.100000 | 0.020000 |
| 25% | 1.000000 | 4.190000 |
| 50% | 1.000000 | 6.870000 |
| 75% | 1.000000 | 12.270000 |
| max | 176.000000 | 531.270000 |
# Explore categorical data
print(f"Number of unique transactions {df['Transaction_ID'].nunique()}")
print(f"Number of unique customers {df['Customer_ID'].nunique()}")
print(f"Number of unique SKU items {df['SKU'].nunique()}")
# Dates are dd/mm/yyyy strings, so str min()/max() compare lexicographically
# and report a wrong period; parse to datetime before taking the range.
dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
print(f"Period of invoices {dates.min():%d/%m/%Y} - {dates.max():%d/%m/%Y}")
print('SKU items in data set:')  # plain string: no placeholders, no f-prefix needed
df['SKU'].value_counts()
Number of unique transactions 31694 Number of unique customers 15113 Number of unique SKU items 4448 Period of invoices 01/02/2016 - 31/05/2016 SKU items in data set:
UNJKW 911
COWU2 432
C6TXL 361
OV1P9 344
M6J9W 308
...
24X9E 1
DGQ1A 1
2EIBT 1
SD3UI 1
KH5IL 1
Name: SKU, Length: 4448, dtype: int64
# df['Transaction_ID'].is_unique  -> False: a transaction spans several rows
# Use the (Transaction_ID, SKU) pair as a hierarchical (multi-) index.
df = df.set_index(['Transaction_ID', 'SKU'])
# df.index.is_unique  -> check whether the pair forms a unique key (presumably
#                        it does here — TODO confirm)
# Show the resulting MultiIndex
df.index
MultiIndex([( '4892', '29DS1'),
( '4956', 'VW5YU'),
( '4955', 'P8FMZ'),
( '4955', 'LEYMS'),
( '4954', 'RH5J5'),
( '4953', '7UQEH'),
( '4952', '4D95F'),
( '4951', 'JE62Y'),
( '4951', 'DWJC4'),
( '4951', 'W53F2'),
...
('32276', '7IE9S'),
('32275', 'OI6EH'),
('32275', 'LJ26I'),
('32275', '2984Y'),
('32275', 'LG6Y2'),
('32275', '1H4DM'),
('32275', 'QGK3S'),
('32277', 'BMGRM'),
('32275', 'LPLTZ'),
('32275', 'XFAEF')],
names=['Transaction_ID', 'SKU'], length=63990)
# Get all details of specific transaction
# Selecting one first-level label returns every SKU line of that transaction,
# with the Transaction_ID level dropped from the result.
df.loc['32408',:]
| Date | Customer_ID | Quantity | Sales_Amount | |
|---|---|---|---|---|
| SKU | ||||
| 72CDS | 30/06/2016 | 15632 | 1.0 | 0.82 |
| JZJII | 30/06/2016 | 15632 | 1.0 | 0.18 |
# Range of transactions
# Label slicing on a MultiIndex requires a lexsorted index, hence sort first.
df = df.sort_index()
df.loc['32275':'32276',:]
| Date | Customer_ID | Quantity | Sales_Amount | ||
|---|---|---|---|---|---|
| Transaction_ID | SKU | ||||
| 32275 | 1H4DM | 30/06/2016 | 16993 | 1.0 | 37.63 |
| 2984Y | 30/06/2016 | 16993 | 3.0 | 11.65 | |
| 360JS | 30/06/2016 | 16993 | 3.0 | 8.18 | |
| LG6Y2 | 30/06/2016 | 16993 | 3.0 | 22.09 | |
| LJ26I | 30/06/2016 | 16993 | 3.0 | 64.97 | |
| LPLTZ | 30/06/2016 | 16993 | 4.0 | 15.60 | |
| NDCX9 | 30/06/2016 | 16993 | 1.0 | 2.84 | |
| OI6EH | 30/06/2016 | 16993 | 1.0 | 27.67 | |
| QGK3S | 30/06/2016 | 16993 | 1.0 | 5.69 | |
| XFAEF | 30/06/2016 | 16993 | 1.0 | 6.90 | |
| 32276 | 7IE9S | 30/06/2016 | 16993 | 1.0 | 8.38 |
| ZVTO4 | 30/06/2016 | 16993 | 1.0 | 4.57 |
# Get all transactions of a specific SKU
# Slicing on the second index level: ':' spans all transactions.
df.loc[:,'F9JVE',:].head()
| Date | Customer_ID | Quantity | Sales_Amount | |
|---|---|---|---|---|
| Transaction_ID | ||||
| 10765 | 03/06/2016 | 9030 | 1.0 | 3.53 |
| 14503 | 25/03/2016 | 7090 | 1.0 | 3.53 |
| 14859 | 29/03/2016 | 17691 | 1.0 | 3.74 |
| 19345 | 22/04/2016 | 3143 | 1.0 | 3.53 |
| 20413 | 27/04/2016 | 17196 | 1.0 | 3.53 |
# Get all transactions for a range of SKUs (second index level).
# This label slice also relies on the index having been sorted above.
df.loc[:,'DUV2Y':'F9JVE',:]
| Date | Customer_ID | Quantity | Sales_Amount | ||
|---|---|---|---|---|---|
| Transaction_ID | SKU | ||||
| 10004 | F79YP | 03/02/2016 | 3546 | 1.0 | 14.25 |
| 10007 | E3PAN | 03/02/2016 | 4919 | 1.0 | 9.68 |
| F7FQ5 | 03/02/2016 | 4919 | 3.0 | 7.29 | |
| 10012 | F64H7 | 03/02/2016 | 2465 | 1.0 | 2.86 |
| 10015 | EMJ1S | 03/02/2016 | 7474 | 1.0 | 3.68 |
| ... | ... | ... | ... | ... | ... |
| 9973 | ECUP5 | 03/01/2016 | 1288 | 1.0 | 12.48 |
| 9982 | F3M35 | 03/01/2016 | 8688 | 1.0 | 9.63 |
| 9984 | DWJC4 | 03/01/2016 | 3767 | 1.0 | 12.87 |
| 9988 | F90L2 | 03/01/2016 | 4888 | 1.0 | 2.37 |
| 9997 | DWYF6 | 03/01/2016 | 9165 | 1.0 | 11.93 |
3415 rows × 4 columns
# Aggregate over the outer index level: total quantity and revenue per transaction.
df.groupby(level="Transaction_ID").sum() # sum per transaction
| Quantity | Sales_Amount | |
|---|---|---|
| Transaction_ID | ||
| 1 | 1.0 | 3.13 |
| 10 | 2.0 | 7.49 |
| 100 | 1.0 | 3.34 |
| 10000 | 1.0 | 15.67 |
| 10001 | 1.0 | 8.37 |
| ... | ... | ... |
| 9995 | 2.0 | 10.81 |
| 9996 | 13.0 | 88.23 |
| 9997 | 1.0 | 11.93 |
| 9998 | 1.0 | 4.57 |
| 9999 | 3.0 | 43.83 |
31694 rows × 2 columns
# Aggregate over the inner index level: total quantity and revenue per SKU.
df.groupby(level="SKU").sum()
| Quantity | Sales_Amount | |
|---|---|---|
| SKU | ||
| 00GVC | 2.0 | 35.36 |
| 00OK1 | 13.0 | 16.56 |
| 0121I | 16.0 | 32.47 |
| 01IEO | 24.0 | 149.21 |
| 01IQT | 5.0 | 6.84 |
| ... | ... | ... |
| ZWWB5 | 7.0 | 12.04 |
| ZYF2U | 7.0 | 65.66 |
| ZZ2AO | 1.0 | 24.38 |
| ZZM1A | 26.0 | 164.21 |
| ZZNC5 | 1.0 | 10.79 |
4448 rows × 2 columns
# Reset the index
# Move Transaction_ID and SKU back into ordinary columns (RangeIndex restored).
df = df.reset_index()
df
| Transaction_ID | SKU | Date | Customer_ID | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|
| 0 | 1 | 0EM7L | 01/02/2016 | 2547 | 1.0 | 3.13 |
| 1 | 10 | KWDJZ | 01/02/2016 | 7548 | 1.0 | 5.38 |
| 2 | 10 | UNJKW | 01/02/2016 | 7548 | 1.0 | 2.11 |
| 3 | 100 | MRE4J | 01/02/2016 | 3131 | 1.0 | 3.34 |
| 4 | 10000 | P1T5K | 03/01/2016 | 5810 | 1.0 | 15.67 |
| ... | ... | ... | ... | ... | ... | ... |
| 63985 | 9997 | DWYF6 | 03/01/2016 | 9165 | 1.0 | 11.93 |
| 63986 | 9998 | ZVTO4 | 03/01/2016 | 2557 | 1.0 | 4.57 |
| 63987 | 9999 | 16GCL | 03/01/2016 | 7820 | 1.0 | 26.43 |
| 63988 | 9999 | DGPZK | 03/01/2016 | 7820 | 1.0 | 10.15 |
| 63989 | 9999 | U26S2 | 03/01/2016 | 7820 | 1.0 | 7.25 |
63990 rows × 6 columns
Merge or join operations combine datasets by linking rows using one or more keys. These operations are particularly important in relational databases (e.g., SQL-based).
# SKU -> category lookup table, one row per SKU.
sku_categ = pd.read_csv("sku_categ.csv")
sku_categ.head()
| SKU | SKU_Category | |
|---|---|---|
| 0 | 0EM7L | X52 |
| 1 | 68BRQ | 2ML |
| 2 | CZUZX | 0H2 |
| 3 | 549KK | 0H2 |
| 4 | K8EHH | 0H2 |
# Merge two dataframes by key
# Attach each row's SKU_Category via an (inner) join on the SKU column;
# the row count is unchanged, so every SKU has a category entry.
df = df.merge(sku_categ, on='SKU')
df
| Transaction_ID | SKU | Date | Customer_ID | Quantity | Sales_Amount | SKU_Category | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0EM7L | 01/02/2016 | 2547 | 1.0 | 3.13 | X52 |
| 1 | 12476 | 0EM7L | 16/03/2016 | 17168 | 1.0 | 3.13 | X52 |
| 2 | 19516 | 0EM7L | 22/04/2016 | 15706 | 1.0 | 3.13 | X52 |
| 3 | 2018 | 0EM7L | 14/01/2016 | 8283 | 1.0 | 3.13 | X52 |
| 4 | 23824 | 0EM7L | 15/05/2016 | 19800 | 1.0 | 3.34 | X52 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 63985 | 9926 | Q309I | 03/01/2016 | 6436 | 1.0 | 4.06 | A0G |
| 63986 | 9956 | FN1CT | 03/01/2016 | 245 | 1.0 | 26.51 | LSD |
| 63987 | 9959 | 64NPA | 03/01/2016 | 1658 | 1.0 | 6.00 | JKC |
| 63988 | 9980 | TGW8R | 03/01/2016 | 44 | 1.0 | 3.85 | A38 |
| 63989 | 9992 | NVRQE | 03/01/2016 | 3275 | 1.0 | 6.25 | T80 |
63990 rows × 7 columns


# Second half of 2016 (July-December); already carries the SKU_Category column.
df2 = pd.read_csv('sku_transactions_2.csv')
df2
| Date | Customer_ID | Transaction_ID | SKU_Category | SKU | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|---|
| 0 | 2016-07-01 | 5694 | 731 | 0H2 | 51OBI | 1.0 | 6.75 |
| 1 | 2016-07-01 | 5694 | 731 | 3WV | WKRVM | 1.0 | 4.75 |
| 2 | 2016-07-01 | 1886 | 732 | BZU | 68AG2 | 1.0 | 7.23 |
| 3 | 2016-07-01 | 1569 | 733 | FU5 | 3ZY0H | 1.0 | 5.01 |
| 4 | 2016-07-01 | 8837 | 734 | XVK | 01IEO | 1.0 | 4.96 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 67711 | 2016-12-31 | 16860 | 64622 | R6E | V6P7N | 1.0 | 2.13 |
| 67712 | 2016-12-31 | 16860 | 64622 | R6E | F90L2 | 1.0 | 2.49 |
| 67713 | 2016-12-31 | 16860 | 64622 | SFC | AM6EH | 1.0 | 2.86 |
| 67714 | 2016-12-31 | 17306 | 64616 | C8Z | 520UE | 1.0 | 8.49 |
| 67715 | 2016-12-31 | 13935 | 64609 | 0H2 | 6R0Z5 | 1.0 | 6.12 |
67716 rows × 7 columns
# Stack the two halves row-wise (columns aligned by name).
# NOTE(review): the index is not reset, so row labels repeat across the two
# frames — pass ignore_index=True for a fresh RangeIndex. Also the Date
# formats differ (dd/mm/yyyy vs yyyy-mm-dd); parse to datetime before any
# time-based analysis.
pd.concat([df, df2])
| Transaction_ID | SKU | Date | Customer_ID | Quantity | Sales_Amount | SKU_Category | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0EM7L | 01/02/2016 | 2547 | 1.0 | 3.13 | X52 |
| 1 | 12476 | 0EM7L | 16/03/2016 | 17168 | 1.0 | 3.13 | X52 |
| 2 | 19516 | 0EM7L | 22/04/2016 | 15706 | 1.0 | 3.13 | X52 |
| 3 | 2018 | 0EM7L | 14/01/2016 | 8283 | 1.0 | 3.13 | X52 |
| 4 | 23824 | 0EM7L | 15/05/2016 | 19800 | 1.0 | 3.34 | X52 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 67711 | 64622 | V6P7N | 2016-12-31 | 16860 | 1.0 | 2.13 | R6E |
| 67712 | 64622 | F90L2 | 2016-12-31 | 16860 | 1.0 | 2.49 | R6E |
| 67713 | 64622 | AM6EH | 2016-12-31 | 16860 | 1.0 | 2.86 | SFC |
| 67714 | 64616 | 520UE | 2016-12-31 | 17306 | 1.0 | 8.49 | C8Z |
| 67715 | 64609 | 6R0Z5 | 2016-12-31 | 13935 | 1.0 | 6.12 | 0H2 |
131706 rows × 7 columns

import numpy as np
import pandas as pd

# Small labelled 2x3 frame used to demonstrate stack/unstack below.
state_idx = pd.Index(["Ohio", "Colorado"], name="state")
number_idx = pd.Index(["one", "two", "three"], name="number")
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=state_idx, columns=number_idx)
data
| number | one | two | three |
|---|---|---|---|
| state | |||
| Ohio | 0 | 1 | 2 |
| Colorado | 3 | 4 | 5 |
# Stack dataframe
# Pivot the columns into the innermost index level, producing a Series
# with a (state, number) MultiIndex.
result = data.stack()
result
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int64
# Unstack
# Inverse of stack: the innermost index level becomes the columns again.
result.unstack()
| number | one | two | three |
|---|---|---|---|
| state | |||
| Ohio | 0 | 1 | 2 |
| Colorado | 3 | 4 | 5 |
By default, the innermost level is unstacked
# unstack different level
# level=0 pivots the outer "state" level into the columns instead.
result.unstack(level=0)
| state | Ohio | Colorado |
|---|---|---|
| number | ||
| one | 0 | 3 |
| two | 1 | 4 |
| three | 2 | 5 |
import pandas as pd
import numpy as np

# Toy frame for the groupby examples: two key columns (both containing
# missing values — key2 uses the nullable Int64 dtype) and two columns of
# random measurements.
group_labels = ["a", "a", None, "b", "b", "a", None]
level_labels = pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64")
df = pd.DataFrame({"key1": group_labels,
                   "key2": level_labels,
                   "data1": np.random.standard_normal(7),
                   "data2": np.random.standard_normal(7)})
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | 1 | -0.831921 | -0.285859 |
| 1 | a | 2 | -0.775401 | -0.165066 |
| 2 | None | 1 | -0.912723 | 0.638457 |
| 3 | b | 2 | -0.815619 | -0.579768 |
| 4 | b | 1 | 0.677665 | 0.785730 |
| 5 | a | <NA> | 0.805057 | 0.293752 |
| 6 | None | 1 | 0.006672 | 0.336428 |
Compute the mean of the data1 column using the labels from key1
# Group the data1 Series by the labels in key1 (lazy: nothing is computed yet).
grouped = df["data1"].groupby(df["key1"])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fa672b27c10>
The groupby call has not computed anything yet; it returns an object that holds all the information needed to apply an operation to each group.
# Compute the mean
# Triggers the actual aggregation: one mean of data1 per key1 group.
grouped.mean()
key1 a -0.267422 b -0.068977 Name: data1, dtype: float64
# Groupby multiple keys(levels)
# Grouping by two Series yields a result indexed by a (key1, key2) MultiIndex.
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means
key1 key2
a 1 -0.831921
2 -0.775401
b 1 0.677665
2 -0.815619
Name: data1, dtype: float64
Any missing values in a group key are excluded from the result by default.
# Groupby multiple keys(levels)
# dropna=False keeps groups whose key is missing (None/<NA>) in the result.
means = df["data1"].groupby([df["key1"], df["key2"]], dropna=False).mean()
means
key1 key2
a 1 -0.831921
2 -0.775401
<NA> 0.805057
b 1 0.677665
2 -0.815619
NaN 1 -0.453026
Name: data1, dtype: float64
# Unstack the previous result
# key2 values become columns; one row per key1 value.
means.unstack()
| key2 | 1 | 2 | <NA> |
|---|---|---|---|
| key1 | |||
| a | -0.831921 | -0.775401 | 0.805057 |
| b | 0.677665 | -0.815619 | NaN |
| NaN | -0.453026 | NaN | NaN |
The object returned by groupby supports iteration, generating a sequence of 2-tuples containing the group name along with the chunk of data
# A GroupBy object is iterable: each step yields (group key, sub-frame).
for key, chunk in df.groupby("key1"):
    print(key)
    print(chunk)
a key1 key2 data1 data2 0 a 1 -0.831921 -0.285859 1 a 2 -0.775401 -0.165066 5 a <NA> 0.805057 0.293752 b key1 key2 data1 data2 3 b 2 -0.815619 -0.579768 4 b 1 0.677665 0.785730
import numpy as np
import pandas as pd

# 5x5 frame of random draws; two cells in Wanda's row are blanked out to
# demonstrate how NaN is handled by later aggregations.
people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=list("abcde"),
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
people.iloc[2:3, [1, 2]] = np.nan
people
| a | b | c | d | e | |
|---|---|---|---|---|---|
| Joe | 0.268135 | -0.236627 | -0.992172 | -0.120391 | -1.556781 |
| Steve | 0.558533 | 0.603487 | 1.312584 | 1.081793 | 0.818569 |
| Wanda | 1.258828 | NaN | NaN | -0.946992 | -2.048267 |
| Jill | -0.268992 | 0.357191 | -1.020337 | 0.239886 | -0.251390 |
| Trey | 1.488786 | -0.584070 | -0.113982 | 0.882762 | 0.161310 |
# Group by columns
# Map each column to a colour group; mapping keys with no matching column
# ("f") are simply unused. groupby(..., axis="columns") is deprecated
# (pandas 2.1+) and removed in 3.0, so group the transposed frame by its
# index and transpose back — same result, future-proof.
mapping = {"a": "red", "b": "red", "c": "blue", "d": "blue", "e": "red", "f" : "orange"}
people.T.groupby(mapping).sum().T
| blue | red | |
|---|---|---|
| Joe | -1.112563 | -1.525273 |
| Steve | 2.394378 | 1.980589 |
| Wanda | -0.946992 | -0.789439 |
| Jill | -0.780450 | -0.163190 |
| Trey | 0.768780 | 1.066027 |
Aggregations refer to any data transformation that produces scalar values from arrays.

But you can define your own aggregations
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the custom aggregation to every column within each key1 group.
df.groupby('key1').agg(peak_to_peak)
| key2 | data1 | data2 | |
|---|---|---|---|
| key1 | |||
| a | 1 | 1.636979 | 0.579612 |
| b | 1 | 1.493284 | 1.365498 |
# describe() works through groupby too: the full set of summary stats per group.
df.groupby('key1').describe()
| key2 | data1 | data2 | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| key1 | |||||||||||||||||||||
| a | 2.0 | 1.5 | 0.707107 | 1.0 | 1.25 | 1.5 | 1.75 | 2.0 | 3.0 | -0.267422 | ... | 0.014828 | 0.805057 | 3.0 | -0.052391 | 0.305793 | -0.285859 | -0.225463 | -0.165066 | 0.064343 | 0.293752 |
| b | 2.0 | 1.5 | 0.707107 | 1.0 | 1.25 | 1.5 | 1.75 | 2.0 | 2.0 | -0.068977 | ... | 0.304344 | 0.677665 | 2.0 | 0.102981 | 0.965553 | -0.579768 | -0.238393 | 0.102981 | 0.444356 | 0.785730 |
2 rows × 24 columns
# Data-science salary survey: one row per reported position.
df = pd.read_csv("ds_salaries.csv")
df.head()
#https://www.kaggle.com/datasets/zain280/data-science-salaries
| id | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
| 1 | 1 | 2020 | SE | FT | Machine Learning Scientist | 260000 | USD | 260000 | JP | 0 | JP | S |
| 2 | 2 | 2020 | SE | FT | Big Data Engineer | 85000 | GBP | 109024 | GB | 50 | GB | M |
| 3 | 3 | 2020 | MI | FT | Product Data Analyst | 20000 | USD | 20000 | HN | 0 | HN | S |
| 4 | 4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
Question: What is the average salary for 'Data Scientist' ?
# Average of each numeric column for the 'Data Scientist' group.
# numeric_only=True: the frame has string columns (experience_level, job_title,
# ...) whose mean is undefined; without it pandas >= 2.0 raises TypeError.
df.groupby('job_title').mean(numeric_only=True).loc['Data Scientist']
# df['job_title'].value_counts()
id 314.832168 work_year 2021.391608 salary 508347.202797 salary_in_usd 108187.832168 remote_ratio 63.986014 Name: Data Scientist, dtype: float64
Question: Are salaries changing between 2020 and 2022, for all job titles?
# Median of each numeric column per (job title, year); numeric_only=True keeps
# pandas >= 2.0 from raising TypeError on the string columns.
df.groupby(['job_title','work_year']).median(numeric_only=True)
| id | salary | salary_in_usd | remote_ratio | ||
|---|---|---|---|---|---|
| job_title | work_year | ||||
| 3D Computer Vision Researcher | 2021 | 77.0 | 400000.0 | 5409.0 | 50.0 |
| AI Scientist | 2020 | 52.0 | 300000.0 | 45896.0 | 50.0 |
| 2021 | 178.5 | 33500.0 | 15026.5 | 100.0 | |
| 2022 | 498.5 | 160000.0 | 160000.0 | 50.0 | |
| Analytics Engineer | 2022 | 464.0 | 179850.0 | 179850.0 | 50.0 |
| ... | ... | ... | ... | ... | ... |
| Product Data Analyst | 2020 | 12.0 | 235000.0 | 13036.0 | 50.0 |
| Research Scientist | 2020 | 29.5 | 246000.0 | 246000.0 | 25.0 |
| 2021 | 198.5 | 64999.5 | 66904.5 | 50.0 | |
| 2022 | 502.5 | 102500.0 | 106713.5 | 50.0 | |
| Staff Data Scientist | 2021 | 283.0 | 105000.0 | 105000.0 | 100.0 |
98 rows × 4 columns
Question: Would you prefer to work at a large company for a higher salary?
# Median numeric stats per company size; numeric_only=True keeps
# pandas >= 2.0 from raising TypeError on the string columns.
df.groupby('company_size').median(numeric_only=True)
| id | work_year | salary | salary_in_usd | remote_ratio | |
|---|---|---|---|---|---|
| company_size | |||||
| L | 194.5 | 2021.0 | 120250.0 | 100000.0 | 100.0 |
| M | 392.5 | 2022.0 | 116075.0 | 113188.0 | 100.0 |
| S | 149.0 | 2021.0 | 80000.0 | 65000.0 | 100.0 |
Question: Show the top 5 salaries from each job title
def find_top_5(arr):
    """Return the five rows of *arr* with the highest salary_in_usd."""
    ranked = arr.sort_values('salary_in_usd', ascending=False)
    return ranked.head(5)
# Run the helper on each job_title group, then pull out one title's top rows.
df.groupby('job_title').apply(find_top_5).loc['Data Scientist']
| id | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 63 | 63 | 2020 | SE | FT | Data Scientist | 412000 | USD | 412000 | US | 100 | US | L |
| 416 | 416 | 2022 | SE | FT | Data Scientist | 260000 | USD | 260000 | US | 100 | US | M |
| 486 | 486 | 2022 | SE | FT | Data Scientist | 230000 | USD | 230000 | US | 100 | US | M |
| 592 | 592 | 2022 | SE | FT | Data Scientist | 230000 | USD | 230000 | US | 100 | US | M |
| 472 | 472 | 2022 | SE | FT | Data Scientist | 220000 | USD | 220000 | US | 100 | US | M |